```python
# lst = [results1, results2, results3, results4, results5, results6, results7, results8]
# pd.concat(lst)
```
Summary
| Attempt | throw (fraud_rate) | split (test_fraud_rate) | theta (s) | gamma | Notes |
|---|---|---|---|---|---|
| 1 | 0.3 | 0.05 | 8.028000e+04 | 0.3 | |
| 2 | 0.3 | 0.05 | 8.028000e+04 | 0.2 | |
| 3 | 0.3 | 0.05 | 9.028000e+04 | 0.2 | |
| 4 | 0.3 | 0.05 | 7.028000e+04 | 0.2 | |
| 5 | 0.3 | 0.005 | 8.028000e+04 | 0.3 | |
| 6 | 0.3 | 0.005 | 8.028000e+04 | 0.2 | |
| 7 | 0.3 | 0.005 | 9.028000e+04 | 0.2 | |
| 8 | 0.3 | 0.005 | 7.028000e+04 | 0.2 | |
| Attempt | accuracy_score | precision_score | recall_score | f1_score | roc_auc_score |
|---|---|---|---|---|---|
| 1 | 0.970862 | 0.639821 | 0.953333 | 0.765730 | 0.962559 |
| 2 | 0.968365 | 0.619565 | 0.950000 | 0.750000 | 0.959665 |
| 3 | 0.970030 | 0.635135 | 0.940000 | 0.758065 | 0.955804 |
| 4 | 0.967865 | 0.616558 | 0.943333 | 0.745718 | 0.956244 |
| 5 | 0.969530 | 0.627451 | 0.960000 | 0.758893 | 0.965016 |
| 6 | 0.970196 | 0.635955 | 0.943333 | 0.759732 | 0.957471 |
| 7 | 0.970529 | 0.633987 | 0.970000 | 0.766798 | 0.970279 |
| 8 | 0.968531 | 0.618337 | 0.966667 | 0.754226 | 0.967648 |
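The table above is just the eight one-row result frames stacked; a minimal sketch of the assembly, assuming `results1` through `results8` from the attempts below are in scope (the `ignore_index` call and the 1-based relabelling are my additions):

```python
# Stack the eight one-row result frames and label rows 1..8 to match the attempts.
lst = [results1, results2, results3, results4,
       results5, results6, results7, results8]
summary = pd.concat(lst, ignore_index=True)
summary.index = range(1, 9)
summary
```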
imports

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb

# sklearn
from sklearn import model_selection  # train/test split utilities
from sklearn import ensemble         # RF, GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
```
```python
def throw(df, fraud_rate):  # downsample normal transactions to hit the target fraud rate
    df1 = df[df['is_fraud'] == 1].copy()
    df0 = df[df['is_fraud'] == 0].copy()
    df0_downsample = (len(df1) * (1 - fraud_rate)) / (len(df0) * fraud_rate)
    df0_down = df0.sample(frac=df0_downsample, random_state=42)
    df_p = pd.concat([df1, df0_down])
    return df_p
```
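As a quick sanity check of the downsampling arithmetic: keeping every fraud row and a fraction `len(df1)*(1-r) / (len(df0)*r)` of the normal rows leaves a frame whose fraud rate is approximately `r`. A sketch on a hypothetical toy frame:

```python
# 10 fraud rows and 990 normal rows (1% fraud), downsampled to ~30% fraud:
toy = pd.DataFrame({'is_fraud': [1] * 10 + [0] * 990})
toy_down = throw(toy, 0.3)
# keeps all 10 fraud rows and ~23 normal rows, so the mean is ~0.3
print(len(toy_down), toy_down.is_fraud.mean())
```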
```python
def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3):
    n = len(data_frame)

    # Separate fraud and normal transactions
    fraud_data = data_frame[data_frame['is_fraud'] == 1]
    normal_data = data_frame[data_frame['is_fraud'] == 0]

    # Compute the test set sizes
    test_samples = int(test_fraud_rate * (n * test_rate))
    remaining_test_samples = int(n * test_rate) - test_samples

    # Randomly sample test rows from the fraud and normal transactions
    test_fraud_data = fraud_data.sample(n=test_samples, replace=False)
    test_normal_data = normal_data.sample(n=remaining_test_samples, replace=False)

    # Combine the test data
    test_data = pd.concat([test_normal_data, test_fraud_data])

    # Build the training data from everything not in the test set
    train_data = data_frame[~data_frame.index.isin(test_data.index)]

    return train_data, test_data
```
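The test set is `test_rate` of all rows, of which a fraction `test_fraud_rate` is fraud; a quick check on a hypothetical toy frame:

```python
# 1,000 rows at 30% fraud; the test split should be 300 rows at 5% fraud.
toy = pd.DataFrame({'is_fraud': [1] * 300 + [0] * 700})
toy_tr, toy_tst = split_dataframe(toy, test_fraud_rate=0.05)
print(len(toy_tst), toy_tst.is_fraud.mean())  # 300 0.05
```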
```python
def concat(df_tr, df_tst):
    df = pd.concat([df_tr, df_tst])
    # Boolean masks marking train vs. test rows, so the split is preserved
    # after the concatenated frame's index is reset.
    train_mask = np.concatenate((np.full(len(df_tr), True), np.full(len(df_tst), False)))
    test_mask = np.concatenate((np.full(len(df_tr), False), np.full(len(df_tst), True)))
    mask = (train_mask, test_mask)
    return df, mask
```
```python
def evaluation(y, yhat):
    metrics = [sklearn.metrics.accuracy_score,
               sklearn.metrics.precision_score,
               sklearn.metrics.recall_score,
               sklearn.metrics.f1_score,
               sklearn.metrics.roc_auc_score]
    return pd.DataFrame({m.__name__: [m(y, yhat).round(6)] for m in metrics})
```
```python
def compute_time_difference(group):
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            time_difference = abs((group.iloc[i].trans_date_trans_time
                                   - group.iloc[j].trans_date_trans_time).total_seconds())
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result
```
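Each group yields every ordered pair `(i, j)` together with `|t_i - t_j|` in seconds, so the cost is quadratic in the group size; a hypothetical two-row group for illustration:

```python
# Two transactions 60 seconds apart produce the four ordered pairs below.
g = pd.DataFrame({'trans_date_trans_time':
                  pd.to_datetime(['2020-01-01 00:00:00', '2020-01-01 00:01:00'])})
print(compute_time_difference(g))
# [[0, 0, 0.0], [0, 1, 60.0], [1, 0, 60.0], [1, 1, 0.0]]
```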
```python
def edge_index(df, unique_col, theta, gamma):
    groups = df.groupby(unique_col)
    edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups)
                           for item in sublist])
    edge_index = edge_index.astype(np.float64)
    # filename = f"edge_index{str(unique_col).replace(' ', '').replace('_', '')}.npy"  # save
    # np.save(filename, edge_index)
    # Edge weight exp(-dt/theta); the `!= 1` factor zeroes pairs with dt == 0 (self-pairs).
    edge_index[:, 2] = (np.exp(-edge_index[:, 2] / theta) != 1) * (np.exp(-edge_index[:, 2] / theta)).tolist()
    edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > gamma],
                              dtype=torch.long).t()
    return edge_index
```
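An edge `(i, j)` survives when `exp(-dt/theta) > gamma`, i.e. when the time gap satisfies `dt < -theta * ln(gamma)`. A quick look at the cut-offs implied by the (theta, gamma) pairs used below:

```python
# Time-gap cut-off below which a pair of same-card transactions gets an edge.
for theta, gamma in [(8.028e4, 0.3), (8.028e4, 0.2), (9.028e4, 0.2), (7.028e4, 0.2)]:
    cutoff = -theta * np.log(gamma)  # seconds
    print(f"theta={theta:.3e}, gamma={gamma}: dt < {cutoff / 3600:.1f} hours")
```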
```python
def gcn_data(df, edge_index, mask):
    # edge_index and mask are passed in explicitly rather than read from globals.
    x = torch.tensor(df['amt'].values, dtype=torch.float).reshape(-1, 1)  # node feature: amount
    y = torch.tensor(df['is_fraud'].values, dtype=torch.int64)            # node label
    data = torch_geometric.data.Data(x=x, edge_index=edge_index, y=y,
                                     train_mask=torch.tensor(mask[0]),
                                     test_mask=torch.tensor(mask[1]))
    return data
```
```python
class GCN1(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 32)
        self.conv2 = GCNConv(32, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)
```
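A minimal shape check on a hypothetical three-node graph, confirming that the model maps one feature per node to two log-probabilities per node:

```python
# Three nodes, one feature each; edges 0-1 and 1-2 in both directions.
toy_graph = torch_geometric.data.Data(
    x=torch.tensor([[1.0], [2.0], [3.0]]),
    edge_index=torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]], dtype=torch.long),
)
out = GCN1()(toy_graph)
print(out.shape)  # torch.Size([3, 2])
```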
```python
def train_and_evaluate_model(data, model, optimizer, num_epochs=400):
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
    model.eval()
    pred = model(data).argmax(dim=1)
    yyhat = pred[data.test_mask]
    return yyhat
```
```python
# # Create the model and optimizer
# model = GCN1()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# # Call the function
# yyhat = train_and_evaluate_model(data, model, optimizer)
```
```python
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time=list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
```
(throw 0.3 / split 0.05)
```python
df = throw(fraudTrain, 0.3)
df_tr, df_tst = split_dataframe(df, 0.05)
df2, mask = concat(df_tr, df_tst)
df2['index'] = df2.index
df3 = df2.reset_index()
```

```python
df3.is_fraud.mean()
```

0.3

```python
df_tst.is_fraud.mean()
```

0.04995004995004995
Attempt 1

```python
eidx = edge_index(df3, 'cc_num', 8.028000e+04, 0.3)  # assigned to eidx so the edge_index function is not shadowed
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results1 = evaluation(yy, yyhat)
```
Attempt 2

```python
eidx = edge_index(df3, 'cc_num', 8.028000e+04, 0.2)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results2 = evaluation(yy, yyhat)
```
Attempt 3

```python
eidx = edge_index(df3, 'cc_num', 9.028000e+04, 0.2)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results3 = evaluation(yy, yyhat)
```
Attempt 4

```python
eidx = edge_index(df3, 'cc_num', 7.028000e+04, 0.2)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results4 = evaluation(yy, yyhat)
```
(throw 0.3 / split 0.005)
```python
df = throw(fraudTrain, 0.3)
df_tr, df_tst = split_dataframe(df, 0.005)
df2, mask = concat(df_tr, df_tst)
df2['index'] = df2.index
df3 = df2.reset_index()
```

```python
df3.is_fraud.mean()
```

0.3

```python
df_tst.is_fraud.mean()
```

0.004995004995004995
Attempt 5

```python
eidx = edge_index(df3, 'cc_num', 8.028000e+04, 0.3)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results5 = evaluation(yy, yyhat)
```
Attempt 6

```python
eidx = edge_index(df3, 'cc_num', 8.028000e+04, 0.2)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results6 = evaluation(yy, yyhat)
```
Attempt 7

```python
eidx = edge_index(df3, 'cc_num', 9.028000e+04, 0.2)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results7 = evaluation(yy, yyhat)
```
Attempt 8

```python
eidx = edge_index(df3, 'cc_num', 7.028000e+04, 0.2)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results8 = evaluation(yy, yyhat)
```